#!/usr/bin/perl
use strict;
use warnings;

use File::Path qw(make_path);

# Step 1: create page/index/ and download the 34 index pages from
# tv.youku.com into page/index/<n>.html.
#
# Takes no arguments and returns nothing.  Dies if page/index cannot be
# created (make_path croaks on a failed mkdir); a failed download only
# produces a warning so the remaining pages are still fetched.
sub step_1 {
	make_path('page/index');

	for my $idx (1 .. 34) {
		my $url  = "http://tv.youku.com/new/new/_page16113_$idx.html";
		my $dest = "./page/index/$idx.html";

		# List form runs wget directly, without an intermediate shell.
		system('wget', $url, '-O', $dest) == 0
			or warn "wget failed for $url (status $?)\n";
	}

	return;
}

# ---------------------------------------------------------------------------
# Helpers shared by steps 2-5.  Every step walks the same chain:
#   index page -> show ("show_page") links -> episode ("v_show") links
# ---------------------------------------------------------------------------

# Numbers of the index pages the steps iterate over (page/index/<n>.html).
sub _index_page_numbers {
	return (1 .. 34);
}

# Reads a whole file into one string.
# Returns the content, or undef (after a warning) if the file cannot be opened.
sub _slurp_file {
	my ($path) = @_;

	my $fh;
	if (!open $fh, '<', $path) {
		warn "cannot read $path: $!\n";
		return;
	}
	local $/;    # slurp mode: read everything in one go
	my $content = <$fh>;
	close $fh;

	return $content;
}

# True if $name is usable as a single path component and as a bare word in a
# printed shell command.  Names are scraped from downloaded HTML, so anything
# with a slash, whitespace, quotes or other shell metacharacters is rejected,
# as are names consisting only of dots ("." and "..").
sub _is_safe_name {
	my ($name) = @_;
	return 0 if $name !~ /\A[\w.=%+-]+\z/;
	return 0 if $name =~ /\A\.+\z/;
	return 1;
}

# Extracts the show links from page/index/<idx>.html.
#
# Only lines containing "p_link" are inspected; every href="..." that is
# followed by a charset attribute and contains "show_page/<file>" yields one
# hash ref:
#   url  => the full href value
#   file => the part after "show_page/"
#   id   => file up to its (last) ".html", or undef if file has no ".html"
# Returns an empty list if the index page cannot be read.
sub _show_pages_in_index {
	my ($idx) = @_;

	my $html = _slurp_file("page/index/$idx.html");
	return if !defined $html;

	my @shows;
	for my $line (grep { /p_link/ } split /\n/, $html) {
		while ($line =~ /href="(.+?)" charset/g) {
			my $url = $1;

			my ($file) = $url =~ m{show_page/(.+)\z};
			next if !defined $file;

			if (!_is_safe_name($file)) {
				warn "skipping unsafe show page name '$file' (index page $idx)\n";
				next;
			}

			my ($id) = $file =~ /(.+)\.html/;
			push @shows, { url => $url, file => $file, id => $id };
		}
	}

	return @shows;
}

# Extracts the episode ids from the downloaded show page page/<idx>/<file>.
# An episode link looks like:
#   <li><a href="http://v.youku.com/v_show/id_XNTUxMTE1MDA4.html" title="..." charset="411-5-1" target="_blank">1</a></li>
# and yields the id "id_XNTUxMTE1MDA4".
# Returns an empty list if the show page cannot be read.
sub _episode_ids {
	my ($idx, $show) = @_;

	my $html = _slurp_file("./page/$idx/$show->{file}");
	return if !defined $html;

	my @vids;
	while ($html =~ m{<li><a href="http://v\.youku\.com/v_show/(.+?)\.html}g) {
		my $vid = $1;
		if (!_is_safe_name($vid)) {
			warn "skipping unsafe episode id '$vid' (page/$idx/$show->{file})\n";
			next;
		}
		push @vids, $vid;
	}

	return @vids;
}

# Walks every index page, every show on it that has an id, and every episode
# of that show.
#   $on_episode->($idx, $show_id, $vid)  is called once per episode.
#   $on_show->($idx, $show_id)           (optional) is called once per show,
#                                        before its show page is read.
sub _each_episode {
	my ($on_episode, $on_show) = @_;

	for my $idx (_index_page_numbers()) {
		for my $show (_show_pages_in_index($idx)) {
			my $cid = $show->{id};
			next if !defined $cid;

			$on_show->($idx, $cid) if $on_show;

			for my $vid (_episode_ids($idx, $show)) {
				$on_episode->($idx, $cid, $vid);
			}
		}
	}

	return;
}

# Step 2: download the detail page of every show listed on the index pages
# into page/<idx>/<file>.
#
# Takes no arguments and returns nothing.  A failed download only produces a
# warning; a directory that cannot be created is fatal (make_path croaks).
sub step_2 {
	for my $idx (_index_page_numbers()) {
		my @shows = _show_pages_in_index($idx);
		next if !@shows;

		# Nothing else creates page/<idx>/, and wget -O cannot write into a
		# directory that does not exist.
		make_path("page/$idx");

		for my $show (@shows) {
			my $dest = "./page/$idx/$show->{file}";

			# List form: the URL comes from downloaded HTML and must never be
			# interpreted by a shell.  "--" keeps it from being read as an option.
			system('wget', '-O', $dest, '--', $show->{url}) == 0
				or warn "wget failed for $show->{url} (status $?)\n";
		}
	}

	return;
}

# Step 3: print (do not run) one wget command per episode that would download
# the episode page to page/<idx>/<show_id>/<vid>.html.
#
# NOTE(review): the target directory page/<idx>/<show_id>/ is not created by
# any step (the mkdir was already disabled in the original code); create it
# before running the printed commands.
sub step_3 {
	_each_episode(
		sub {
			my ($idx, $cid, $vid) = @_;
			print "wget http://v.youku.com/v_show/$vid.html -O page/$idx/$cid/$vid.html\n";
		}
	);

	return;
}

# Step 4: create parse/<idx>/<show_id>/ for every show and print (do not run)
# one wget command per episode that would fetch the flvcd.com parse result for
# the episode into parse/<idx>/<show_id>/<vid>.html.
sub step_4 {
	_each_episode(
		sub {
			my ($idx, $cid, $vid) = @_;
			print "wget \"www.flvcd.com/parse.php?format=&kw=http://v.youku.com/v_show/$vid.html\" -O ./parse/$idx/$cid/$vid.html\n";
		},
		sub {
			my ($idx, $cid) = @_;
			make_path("./parse/$idx/$cid");
		},
	);

	return;
}

# Step 5: read every parse/<idx>/<show_id>/<vid>.html and print the entries
# stored in its <input type="hidden" name="inf" value="..."> field, one per
# output line.
#
# The value is split on line breaks (any run of CR/LF) and empty entries are
# dropped.  Unlike the earlier "XXXX" placeholder approach this keeps entries
# that themselves contain "XXXX", keeps a final entry that is not followed by
# a line break, and also works for LF-only files.  Parse pages that are
# missing or have no "inf" field are skipped; this step creates no directories.
sub step_5 {
	_each_episode(
		sub {
			my ($idx, $cid, $vid) = @_;

			my $content = _slurp_file("parse/$idx/$cid/$vid.html");
			return if !defined $content;

			# /s lets the value span several lines.
			my ($inf) = $content =~ m{<input type="hidden" name="inf" value="(.+?)"}s;
			return if !defined $inf;

			for my $entry (grep { length } split /[\r\n]+/, $inf) {
				print $entry, "\n";
			}
			return;
		}
	);

	return;
}

# Command-line entry point: runs the step selected by the first argument.
#
#   no argument       -> prints the usage text to STDOUT, exits with status 0
#   step id 1 .. 5    -> runs the matching step_N(), exits with status 0
#   anything else     -> reports the bad id and the usage text on STDERR,
#                        exits with status 1
#
# The id must consist of digits only.  It is validated as a string before any
# numeric use, because a numeric comparison such as "1abc" == 1 is true and
# non-numeric text compares equal to 0.
sub main {
	my %step_for = (
		1 => \&step_1,    # download index pages
		2 => \&step_2,    # download show detail pages
		3 => \&step_3,    # print episode page download commands
		4 => \&step_4,    # print flvcd parse page download commands
		5 => \&step_5,    # print video URLs found in the parse pages
	);

	my $usage = join '',
		"usage: ./get_html_linux.pl step_id\n",
		"example: ./get_html_linux.pl 1\n",
		"step_id meaning: 1-download index pages, 2-download show detail pages, ",
		"3-print episode page download cmds, 4-print flvcd parse download cmds, ",
		"5-print video URLs from parse pages\n";

	if (@ARGV < 1) {
		print $usage;
		exit(0);
	}

	my $step_id = $ARGV[0];
	my $step    = $step_id =~ /\A\d+\z/ ? $step_for{ $step_id + 0 } : undef;

	if (!$step) {
		print STDERR "unknown step_id '$step_id'\n", $usage;
		exit(1);
	}

	$step->();
	exit(0);
}

# Script entry point: dispatch to the step named on the command line.
main();

